import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
df = pd.read_csv("https://www.mth548.org/_static/kde_marathon_results/marathon_results.csv")
df["tot_minutes"] = pd.to_timedelta(df["Finish"]).dt.total_seconds()/60
df
| Age | M/F | Country | 5K | 10K | 15K | 20K | Half | 25K | 30K | 35K | 40K | Finish | Pace | Overall | Gender | Division | tot_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | M | ETH | 00:14:43 | 00:29:43 | 00:44:57 | 01:00:29 | 01:04:02 | 01:16:07 | 01:32:00 | 01:47:59 | 02:02:39 | 02:09:17 | 00:04:56 | 1 | 1 | 1 | 129.283333 |
| 1 | 30 | M | ETH | 00:14:43 | 00:29:43 | 00:44:58 | 01:00:28 | 01:04:01 | 01:16:07 | 01:31:59 | 01:47:59 | 02:02:42 | 02:09:48 | 00:04:58 | 2 | 2 | 2 | 129.800000 |
| 2 | 29 | M | KEN | 00:14:43 | 00:29:43 | 00:44:57 | 01:00:29 | 01:04:02 | 01:16:07 | 01:32:00 | 01:47:59 | 02:03:01 | 02:10:22 | 00:04:59 | 3 | 3 | 3 | 130.366667 |
| 3 | 28 | M | KEN | 00:14:43 | 00:29:44 | 00:45:01 | 01:00:29 | 01:04:02 | 01:16:07 | 01:32:00 | 01:48:03 | 02:03:47 | 02:10:47 | 00:05:00 | 4 | 4 | 4 | 130.783333 |
| 4 | 32 | M | KEN | 00:14:43 | 00:29:44 | 00:44:58 | 01:00:28 | 01:04:01 | 01:16:07 | 01:32:00 | 01:47:59 | 02:03:27 | 02:10:49 | 00:05:00 | 5 | 5 | 5 | 130.816667 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26293 | 64 | F | USA | 00:50:15 | 01:43:31 | 02:36:53 | 03:32:26 | 03:43:46 | 04:25:53 | 05:19:44 | 06:17:19 | 07:13:34 | 07:38:56 | 00:17:31 | 26594 | 12015 | 269 | 458.933333 |
| 26294 | 61 | F | USA | 00:48:36 | 01:39:39 | 02:39:13 | 03:35:58 | 03:47:55 | 04:32:44 | 05:31:58 | 06:28:56 | 07:26:19 | 07:51:30 | 00:17:59 | 26595 | 12016 | 270 | 471.500000 |
| 26295 | 66 | F | USA | 00:53:03 | 01:47:16 | 02:41:45 | 03:37:07 | 03:48:21 | 04:33:51 | 05:38:56 | 06:38:51 | 07:36:18 | 07:59:33 | 00:18:18 | 26596 | 12017 | 91 | 479.550000 |
| 26296 | 53 | M | USA | 00:49:04 | 01:40:12 | 02:33:31 | 03:31:41 | 03:43:35 | 04:29:20 | 05:31:11 | 06:33:35 | 07:35:38 | 08:00:37 | 00:18:20 | 26597 | 14580 | 2055 | 480.616667 |
| 26297 | 62 | M | USA | 00:40:14 | 01:28:18 | 02:26:46 | 03:28:41 | 03:40:36 | 04:36:06 | 05:43:44 | 06:51:31 | 07:41:28 | 08:06:01 | 00:18:33 | 26598 | 14581 | 898 | 486.016667 |
26298 rows × 18 columns
from scipy.stats import gaussian_kde
kde = gaussian_kde(df["tot_minutes"])
kde(180)
array([0.00732854])
kde.integrate_box(180, 240)
0.6080317849552407
sns.set_theme()
plt.figure(figsize=(10, 5))
x = np.linspace(120, 500, 400)
plt.plot(x, kde(x));
train_frac = 0.5
train_df = df.sample(frac=train_frac, random_state=1)
train_df
| Age | M/F | Country | 5K | 10K | 15K | 20K | Half | 25K | 30K | 35K | 40K | Finish | Pace | Overall | Gender | Division | tot_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5525 | 51 | F | USA | 00:22:10 | 00:44:03 | 01:06:21 | 01:29:02 | 01:33:57 | 01:51:43 | 02:15:15 | 02:39:23 | 03:03:14 | 03:13:58 | 00:07:24 | 5538 | 619 | 8 | 193.966667 |
| 21370 | 46 | F | USA | 00:27:03 | 00:54:21 | 01:24:42 | 01:54:53 | 02:01:05 | 02:24:37 | 02:57:22 | 03:31:12 | 04:03:01 | 04:17:28 | 00:09:50 | 21548 | 9197 | 1464 | 257.466667 |
| 4747 | 42 | M | SIN | 00:21:24 | 00:42:35 | 01:03:53 | 01:25:25 | 01:30:12 | 01:47:41 | 02:10:49 | 02:35:40 | 03:00:07 | 03:10:32 | 00:07:17 | 4757 | 4328 | 766 | 190.533333 |
| 10161 | 39 | F | USA | 00:25:58 | 00:50:59 | 01:15:38 | 01:40:32 | 01:45:59 | 02:05:28 | 02:30:21 | 02:55:14 | 03:19:41 | 03:30:10 | 00:08:01 | 10195 | 2431 | 1935 | 210.166667 |
| 15313 | 51 | M | USA | 00:25:05 | 00:49:23 | 01:14:10 | 01:39:47 | 01:45:25 | 02:05:59 | 02:34:03 | 03:03:15 | 03:33:05 | 03:46:36 | 00:08:39 | 15396 | 9929 | 1438 | 226.600000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25845 | 26 | F | USA | 00:24:59 | 00:49:45 | 01:15:25 | 01:42:03 | 01:48:00 | 02:11:20 | 02:51:15 | 04:13:17 | 05:12:11 | 05:37:01 | 00:12:52 | 26124 | 11749 | 5852 | 337.016667 |
| 10724 | 47 | M | USA | 00:24:21 | 00:48:11 | 01:12:11 | 01:36:28 | 01:41:46 | 02:00:50 | 02:26:08 | 02:52:41 | 03:19:43 | 03:31:54 | 00:08:05 | 10761 | 8037 | 1539 | 211.900000 |
| 24078 | 44 | M | USA | 00:30:23 | 01:01:48 | 01:33:21 | 02:07:02 | 02:14:01 | 02:41:35 | 03:17:55 | 03:54:02 | 04:32:10 | 04:47:48 | 00:10:59 | 24309 | 13595 | 1994 | 287.800000 |
| 13073 | 53 | F | USA | 00:25:28 | 00:50:09 | 01:14:56 | 01:40:11 | 01:45:44 | 02:05:43 | 02:32:16 | 03:00:31 | 03:27:16 | 03:39:09 | 00:08:22 | 13133 | 4095 | 144 | 219.150000 |
| 10840 | 24 | F | USA | 00:24:54 | 00:50:07 | 01:15:26 | 01:40:41 | 01:46:05 | 02:05:48 | 02:31:26 | 02:56:40 | 03:21:04 | 03:32:16 | 00:08:06 | 10877 | 2798 | 2181 | 212.266667 |
13149 rows × 18 columns
test_df = df.drop(train_df.index)
test_df
| Age | M/F | Country | 5K | 10K | 15K | 20K | Half | 25K | 30K | 35K | 40K | Finish | Pace | Overall | Gender | Division | tot_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 30 | M | ETH | 00:14:43 | 00:29:43 | 00:44:58 | 01:00:28 | 01:04:01 | 01:16:07 | 01:31:59 | 01:47:59 | 02:02:42 | 02:09:48 | 00:04:58 | 2 | 2 | 2 | 129.800000 |
| 9 | 33 | M | UKR | 00:15:14 | 00:30:34 | 00:46:05 | 01:01:43 | 01:05:07 | 01:17:18 | 01:33:11 | 01:49:43 | 02:06:16 | 02:13:52 | 00:05:07 | 10 | 10 | 10 | 133.866667 |
| 10 | 33 | M | USA | 00:14:46 | 00:29:50 | 00:45:33 | 01:01:20 | 01:04:48 | 01:17:08 | 01:33:12 | 01:49:52 | 02:06:55 | 02:13:52 | 00:05:07 | 11 | 11 | 11 | 133.866667 |
| 14 | 42 | M | ITA | 00:15:53 | 00:32:17 | 00:48:32 | 01:04:49 | 01:08:21 | 01:21:17 | 01:38:02 | 01:54:55 | 02:11:25 | 02:18:44 | 00:05:18 | 15 | 15 | 1 | 138.733333 |
| 15 | 29 | M | USA | 00:16:08 | 00:32:19 | 00:48:41 | 01:05:01 | 01:08:38 | 01:21:18 | 01:38:01 | 01:54:54 | 02:11:37 | 02:19:12 | 00:05:19 | 16 | 16 | 15 | 139.200000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26287 | 49 | M | USA | 00:39:11 | 01:21:33 | 02:06:32 | 02:57:51 | 03:08:56 | 03:48:37 | 04:40:33 | 05:30:51 | 06:18:53 | 06:39:52 | 00:15:16 | 26588 | 14575 | 2484 | 399.866667 |
| 26288 | 47 | F | USA | 00:43:22 | 01:27:40 | 02:13:36 | 03:01:58 | 03:12:16 | 03:49:07 | 04:41:53 | 05:34:15 | 06:23:00 | 06:42:01 | 00:15:20 | 26589 | 12014 | 1832 | 402.016667 |
| 26295 | 66 | F | USA | 00:53:03 | 01:47:16 | 02:41:45 | 03:37:07 | 03:48:21 | 04:33:51 | 05:38:56 | 06:38:51 | 07:36:18 | 07:59:33 | 00:18:18 | 26596 | 12017 | 91 | 479.550000 |
| 26296 | 53 | M | USA | 00:49:04 | 01:40:12 | 02:33:31 | 03:31:41 | 03:43:35 | 04:29:20 | 05:31:11 | 06:33:35 | 07:35:38 | 08:00:37 | 00:18:20 | 26597 | 14580 | 2055 | 480.616667 |
| 26297 | 62 | M | USA | 00:40:14 | 01:28:18 | 02:26:46 | 03:28:41 | 03:40:36 | 04:36:06 | 05:43:44 | 06:51:31 | 07:41:28 | 08:06:01 | 00:18:33 | 26598 | 14581 | 898 | 486.016667 |
13149 rows × 18 columns
kde = gaussian_kde(train_df["tot_minutes"], bw_method=0.01)
likelihood = kde(test_df["tot_minutes"]).prod()
likelihood
0.0
log_likelihood = np.log10(kde(test_df["tot_minutes"])).sum()
log_likelihood
-29223.116279861904
choices = []
for bw in np.linspace(0.01, 0.2, 20):
kde = gaussian_kde(train_df["tot_minutes"], bw_method=bw)
log_likelihood = np.log10(kde(test_df["tot_minutes"])).sum()
print(f"bw={bw:.2f}, log_like={log_likelihood}")
choices.append((bw, log_likelihood))
bw=0.01, log_like=-29223.116279861904 bw=0.02, log_like=-28815.02844223093 bw=0.03, log_like=-28739.576731511024 bw=0.04, log_like=-28714.39206645263 bw=0.05, log_like=-28703.64705552043 bw=0.06, log_like=-28698.55933011837 bw=0.07, log_like=-28696.240937999693 bw=0.08, log_like=-28695.495815706585 bw=0.09, log_like=-28695.715864167 bw=0.10, log_like=-28696.551643950595 bw=0.11, log_like=-28697.795622141435 bw=0.12, log_like=-28699.327080277988 bw=0.13, log_like=-28701.07954670453 bw=0.14, log_like=-28703.01983770501 bw=0.15, log_like=-28705.134368838317 bw=0.16, log_like=-28707.420328114255 bw=0.17, log_like=-28709.88017636051 bw=0.18, log_like=-28712.518422241672 bw=0.19, log_like=-28715.339911317984 bw=0.20, log_like=-28718.34906860253
choices
[(0.01, -29223.116279861904), (0.02, -28815.02844223093), (0.03, -28739.576731511024), (0.04, -28714.39206645263), (0.05, -28703.64705552043), (0.060000000000000005, -28698.55933011837), (0.06999999999999999, -28696.240937999693), (0.08, -28695.495815706585), (0.09, -28695.715864167), (0.09999999999999999, -28696.551643950595), (0.11, -28697.795622141435), (0.12, -28699.327080277988), (0.13, -28701.07954670453), (0.14, -28703.01983770501), (0.15000000000000002, -28705.134368838317), (0.16, -28707.420328114255), (0.17, -28709.88017636051), (0.18000000000000002, -28712.518422241672), (0.19, -28715.339911317984), (0.2, -28718.34906860253)]
max(choices, key=lambda x: x[1])
(0.08, -28695.495815706585)
kde = gaussian_kde(df["tot_minutes"], bw_method=0.08)
sns.set_theme()
plt.figure(figsize=(10, 5))
x = np.linspace(120, 500, 400)
plt.plot(x, kde(x));
def f(x, y):
return (x**2 + y - 11)**2 + (x + y**2 - 7)**2 - 150
f(10, 7)
11770
x = np.linspace(0, 5, 6)
y = np.linspace(0, 5, 6)
X, Y = np.meshgrid(x, y)
X, Y
(array([[0., 1., 2., 3., 4., 5.],
[0., 1., 2., 3., 4., 5.],
[0., 1., 2., 3., 4., 5.],
[0., 1., 2., 3., 4., 5.],
[0., 1., 2., 3., 4., 5.],
[0., 1., 2., 3., 4., 5.]]),
array([[0., 0., 0., 0., 0., 0.],
[1., 1., 1., 1., 1., 1.],
[2., 2., 2., 2., 2., 2.],
[3., 3., 3., 3., 3., 3.],
[4., 4., 4., 4., 4., 4.],
[5., 5., 5., 5., 5., 5.]]))
plt.plot(X, Y, 'r.');
Z = f(X, Y)
Z
array([[ 20., -14., -76., -130., -116., 50.],
[ -14., -44., -98., -140., -110., 76.],
[ -60., -82., -124., -150., -100., 110.],
[ -82., -92., -118., -124., -50., 188.],
[ -20., -14., -20., -2., 100., 370.],
[ 210., 236., 254., 300., 434., 740.]])
import plotly.graph_objects as go
fig = go.Figure(go.Surface(x=X, y=Y, z=Z))
fig.show()
x = np.linspace(-5, 5, 400)
y = np.linspace(-5, 5, 400)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)
fig = go.Figure(go.Surface(x=X, y=Y, z=Z, colorscale="Picnic"))
fig.show()
plt.figure(figsize=(6, 6))
plt.contour(X, Y, Z, levels=20, colors='k')
<matplotlib.contour.QuadContourSet at 0x7f89971b8700>
plt.figure(figsize=(6, 6))
plt.contourf(X, Y, Z, levels=20, cmap="jet")
<matplotlib.contour.QuadContourSet at 0x7f89975b77f0>
plt.figure(figsize=(6, 6))
cp = plt.contourf(X, Y, Z, levels=20, cmap="seismic", alpha=0.5)
plt.contour(X, Y, Z, levels=20, colors='k')
plt.colorbar(cp)
<matplotlib.colorbar.Colorbar at 0x7f8995cbb460>
from matplotlib.colors import TwoSlopeNorm
plt.figure(figsize=(6, 6))
divnorm = TwoSlopeNorm(vcenter=0)
cp = plt.contourf(X, Y, Z, levels=20, cmap="seismic", norm=divnorm, alpha=0.5)
plt.contour(X, Y, Z, levels=20, colors='k')
plt.colorbar(cp)
<matplotlib.colorbar.Colorbar at 0x7f89b1899520>
plt.figure(figsize=(10, 6))
plt.plot(df["tot_minutes"], df["Age"], 'r.', alpha=0.1)
[<matplotlib.lines.Line2D at 0x7f8995ed24f0>]
data = np.array([[0, 0], [1, 0], [0, 1], [1,1]])
data
array([[0, 0],
[1, 0],
[0, 1],
[1, 1]])
data.T
array([[0, 1, 0, 1],
[0, 0, 1, 1]])
kde = gaussian_kde(data.T, bw_method=0.75)
kde([0.5, 0.7])
array([0.23039736])
x = np.linspace(-1, 2, 400)
y = np.linspace(-1, 2, 400)
X, Y = np.meshgrid(x, y)
Z = kde(np.array([X.flatten(), Y.flatten()])).reshape(X.shape)
Z
array([[0.00102521, 0.00106701, 0.00111018, ..., 0.00111018, 0.00106701,
0.00102521],
[0.00106701, 0.00111052, 0.00115545, ..., 0.00115545, 0.00111052,
0.00106701],
[0.00111018, 0.00115545, 0.00120219, ..., 0.00120219, 0.00115545,
0.00111018],
...,
[0.00111018, 0.00115545, 0.00120219, ..., 0.00120219, 0.00115545,
0.00111018],
[0.00106701, 0.00111052, 0.00115545, ..., 0.00115545, 0.00111052,
0.00106701],
[0.00102521, 0.00106701, 0.00111018, ..., 0.00111018, 0.00106701,
0.00102521]])
fig = go.Figure(go.Surface(x=X, y=Y, z=Z))
fig.show()
plt.figure(figsize=(6,6))
plt.contourf(X, Y, Z, levels=10, cmap="Reds")
plt.contour(X, Y, Z, levels=10, colors="k")
<matplotlib.contour.QuadContourSet at 0x7f89b1feb700>
df[["tot_minutes", "Age"]].T
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 26288 | 26289 | 26290 | 26291 | 26292 | 26293 | 26294 | 26295 | 26296 | 26297 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| tot_minutes | 129.283333 | 129.8 | 130.366667 | 130.783333 | 130.816667 | 130.866667 | 131.333333 | 132.7 | 133.583333 | 133.866667 | ... | 402.016667 | 404.466667 | 416.783333 | 417.833333 | 420.5 | 458.933333 | 471.5 | 479.55 | 480.616667 | 486.016667 |
| Age | 25.000000 | 30.0 | 29.000000 | 28.000000 | 32.000000 | 30.000000 | 32.000000 | 39.0 | 27.000000 | 33.000000 | ... | 47.000000 | 46.000000 | 71.000000 | 57.000000 | 37.0 | 64.000000 | 61.0 | 66.00 | 53.000000 | 62.000000 |
2 rows × 26298 columns
kde = gaussian_kde(df[["tot_minutes", "Age"]].T, bw_method=0.2)
kde([180, 30])
array([0.00024693])
kde([180, 70])
array([6.4040845e-08])
plt.figure(figsize=(10, 6))
x = np.linspace(120, 360, 100)
y = np.linspace(10, 70, 100)
X, Y = np.meshgrid(x, y)
Z = kde(np.array([X.flatten(), Y.flatten()])).reshape(X.shape)
plt.contour(X, Y, Z, levels=10, colors='k', zorder=10)
plt.plot(df["tot_minutes"], df["Age"], 'r.', alpha=0.1)
[<matplotlib.lines.Line2D at 0x7f8997f45b50>]
kde.integrate_box([180, 40], [240, 50])
0.20727237255581568
kde.integrate_box([180, 50], [240, 60])
0.11890598670822185